library(data.table)

ngene = 100

expr_male = fread("/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_male.txt")
expr_female = fread("/home/esaha/BONOBO/data/GTEx_thyroid/GTEx_thyroid_female.txt")

expr = cbind(expr_male[,-1], expr_female[,-1])
rownames(expr) = expr_male$gene

# expr = expr_male[,-1]
# expr = expr_female[,-1]

# Compute mean expression
expr_mean = rowMeans(expr)

# Get most variable and least variable genes
expr_var = apply(expr, MARGIN = 1, sd)

# Most variable genes
var_most = order(expr_var, decreasing = T)[1:(ngene/2)]
# Most variable genes
var_least = order(expr_var, decreasing = F)[1:(ngene/2)]

# Genes to be used in simulation
gene_sim = c(var_most, var_least)

# Expression of genes used in simulation
expr_sim = expr[gene_sim,]

# Mean and covariance for simulation
sim_mean = rowMeans(expr_sim)
sim_cov = cov(t(expr_sim))

write.table(sim_mean, paste("/home/esaha/BONOBO/data/simulation_simple/simulation_mean", ngene ,".csv", sep = ""), row.names = F)
write.table(sim_cov, paste("/home/esaha/BONOBO/data/simulation_simple/simulation_covariance", ngene, ".csv", sep = ""))

# write.table(sim_mean, paste("/home/esaha/BONOBO/data/simulation/simulation_mean_male", ngene ,".csv", sep = ""), row.names = F)
# write.table(sim_cov, paste("/home/esaha/BONOBO/data/simulation/simulation_covariance_male", ngene, ".csv", sep = ""))

# write.table(sim_mean, paste("/home/esaha/BONOBO/data/simulation/simulation_mean_female", ngene ,".csv", sep = ""), row.names = F)
# write.table(sim_cov, paste("/home/esaha/BONOBO/data/simulation/simulation_covariance_female", ngene, ".csv", sep = ""))


# Plot mean and covariance between male and female

mean_expression = c(sim_mean_male, sim_mean_female)
gender = c(rep("MALE", length(sim_mean_male)), rep("FEMALE", length(sim_mean_female)))
fulldata = data.frame(cbind(mean_expression), gender)

library(ggplot2)
ggplot(fulldata, aes(y=mean_expression, x= gender, color = gender)) + geom_boxplot() + ggtitle("Top 50 most variable and top 50 least variable genes")
heatmap(sim_cov_male, main = "Covariance matrix: Male")
heatmap(sim_cov_female, main = "Covariance matrix: Female")


##############################################################
# Get mean and variance for mixture distributions
set.seed(10)

dim(expr_sim)
mix_prop = 0.9 # Mixture proportion of the smaller subgroup
npart = 100 # Number of partitions to choose from

samp1_list = list()
meandiff = rep(0, npart)
for (i in 1:npart){
  allsample = 1:ncol(expr_sim)
  size1 = floor(ncol(expr_sim)*mix_prop)
  samp1 = sample(allsample, size1)
  samp2 = setdiff(allsample, samp1)
  expr_samp1 = data.frame(expr_sim)[,samp1]
  expr_samp2 = data.frame(expr_sim)[,samp2]
  mean1 = rowMeans(expr_samp1)
  mean2 = rowMeans(expr_samp2)
  meandiff[i] = sum((mean1-mean2)^2)
  samp1_list[[i]] = samp1
}

# Find partition with maximum difference in mean
samp1_final = samp1_list[[which.max(meandiff)]]
samp2_final = setdiff(allsample, samp1_final)
expr1_final = data.frame(expr_sim)[,samp1_final]
expr2_final = data.frame(expr_sim)[,samp2_final]
mean1_final = rowMeans(expr1_final)
mean2_final = rowMeans(expr2_final)
cov1_final = cov(t(expr1_final))
cov2_final = cov(t(expr2_final))

write.table(mean1_final, paste("/home/esaha/BONOBO/data/simulation_mixture/dataMarch24/simulation_mean1_prop",  mix_prop,".csv", sep = ""), row.names = F)
write.table(cov1_final, paste("/home/esaha/BONOBO/data/simulation_mixture/dataMarch24/simulation_covariance1_prop", mix_prop, ".csv", sep = ""))
write.table(mean2_final, paste("/home/esaha/BONOBO/data/simulation_mixture/dataMarch24/simulation_mean2_prop",  mix_prop,".csv", sep = ""), row.names = F)
write.table(cov2_final, paste("/home/esaha/BONOBO/data/simulation_mixture/dataMarch24/simulation_covariance2_prop", mix_prop, ".csv", sep = ""))

